# ==============================================================================
# file name: 03-identify-repeated-participation-YG.R
# date: Mar 12, 2024
# author: Bernhard Clemm / Tiago Ventura
# purpose: identify attempts at repeated participation (YouGov)
# THIS SCRIPT REQUIRES ACCESS TO THE RAW DATA AND SERVES FOR REFERENCE ONLY
# ==============================================================================

# DATA =========================================================================

# Load the raw YouGov web-tracking data. The path is left empty here; per the
# file header, the raw data are required to run this script.
browsing_YG <- readr::read_rds("")

# The raw web-tracking data span 31 March 2011 to 8 November 2018 and comprise
# 25,320,661 observations (website visits).
# We exclude all observations collected through proxy and VPN
# (18,865,151 rows) due to data quality issues. All observations prior to
# 17 September 2018 (6,564 rows) are treated as having erroneous date-time
# settings and are excluded from the subsequent analysis.
# NOTE(review): the filter below keeps dates after 2018-09-01 rather than
# 17 September 2018, and no proxy/VPN exclusion is visible in this script --
# confirm the intended cutoff and where the proxy/VPN rows are dropped.

# Keep only visits dated strictly after 1 September 2018
browsing_YG <- dplyr::filter(browsing_YG, date > "2018-09-01")

# PREPROCESSING ================================================================

## Add duration variable ####
## cf. 02-summarize-survey-visits-YG.R

# Build a visit timestamp from `date` and `time`, then approximate each visit's
# duration as the gap to the same participant's next visit.
# NOTE(review): as.POSIXct() is called without `tz`, so the session time zone
# is used even though the column is named *_utc -- confirm this is intended.
browsing_YG <- browsing_YG %>%
  mutate(
    created_utc = as.POSIXct(
      str_c(date, " ", time),
      format = "%Y-%m-%d %H:%M:%OS"
    )
  ) %>%
  # timestamp of the chronologically next visit within each participant
  group_by(caseid) %>%
  mutate(created_utc_next = dplyr::lead(created_utc, order_by = created_utc)) %>%
  ungroup() %>%
  mutate(
    # gap to the next visit, in seconds
    timediff = difftime(created_utc_next, created_utc, units = "secs"),
    # gaps longer than 300 seconds (5 minutes) are set to NA; ifelse() also
    # drops the difftime class, leaving plain numeric seconds
    duration_s_5_na = ifelse(timediff > 300, NA, timediff)
  )
    
## Extract URL components ####

# Split every visited URL into protocol, host, path and query string with adaR;
# the questionnaire filter further down matches on these components.
browsing_YG <- mutate(
  browsing_YG,
  url_protocol = adaR::ada_get_protocol(page_url),
  url_host = adaR::ada_get_host(page_url),
  url_path = adaR::ada_get_pathname(page_url),
  url_query = adaR::ada_get_search(page_url)
)

## Indicator for whether a visit directly follows a visit to the same URL ####
# Within each participant, look up the URL of the chronologically preceding
# visit. Ordering uses `created_utc`, the timestamp this script builds from
# `date` and `time` and already uses for lead() in the duration step;
# `start_time_utc` is not created anywhere in this script.
browsing_YG <- browsing_YG %>%
  group_by(caseid) %>%
  mutate(url_previous = dplyr::lag(page_url, order_by = created_utc)) %>%
  ungroup() %>%
  # 1 = same URL as the previous visit, 0 = different URL; NA for a
  # participant's first visit, which has no predecessor to compare against
  mutate(same_as_prev_url = ifelse(url_previous == page_url, 1, 0))

# IDENTIFY QUESTIONNAIRE VISITS ================================================

# Keep only visits whose host and path (or, for one questionpro.com form, the
# full URL) match the questionnaire URL pattern of a known survey platform,
# then derive `url_quest`, an identifier for the questionnaire being visited.
repeated_YG <- browsing_YG %>%
  filter(
    (grepl("confirmit.com$", url_host) & grepl("^/wix/[a-zA-Z0-9]", url_path)) |
      (grepl("surveygizmo.com$", url_host) & grepl("^/s3/[a-zA-Z0-9]", url_path)) |
      (grepl("surveygizmo.eu$", url_host) & grepl("^/s3/[a-zA-Z0-9]", url_path)) |
      (grepl("surveymonkey.com$", url_host) & grepl("^/r/[a-zA-Z0-9]", url_path)) |
      (grepl("qualtrics.com$", url_host) & grepl("^/jfe/form/[a-zA-Z0-9]", url_path)) |
      (grepl("survey.cmix.com$", url_host) & grepl("^/[A-Z0-9]", url_path)) |
      (grepl("questionpro.com$", url_host) & grepl("^/t/[a-zA-Z0-9]", url_path)) |
      # this pattern spans path and query, so it is matched against the full
      # URL; `page_url` is the URL column used throughout this script (the
      # same condition is repeated in the url_quest step below)
      (grepl("questionpro.com$", url_host) & grepl("/a/TakeSurvey\\?tt=[a-zA-Z0-9]", page_url)) |
      (grepl("formsite.com$", url_host) & grepl("[a-zA-Z0-9]/index\\.html$", url_path)) |
      (grepl("unipark.de$", url_host) & grepl("^/uc/[a-zA-Z0-9]", url_path)) |
      (grepl("typeform.com$", url_host) & grepl("^/to/[a-zA-Z0-9]", url_path)) |
      # negative lookahead (needs perl = TRUE): /forms/ paths not followed by
      # an alphanumeric run ending in index.php
      (grepl("formstack.com$", url_host) & grepl("^/forms/(?![a-zA-Z0-9]*index\\.php)", url_path, perl = TRUE)) |
      (grepl("zohopublic.com$", url_host) & grepl("^/zs/[a-zA-Z0-9]", url_path)) |
      (grepl("zohopublic.eu$", url_host) & grepl("^/zs/[a-zA-Z0-9]", url_path))) %>%
  # create a url-type variable that defines a unique questionnaire
  # in most cases, this is the URL minus the query
  # for the questionpro.com TakeSurvey form, the part of the query before the
  # first "&" is kept as well. "[^&]+" also matches a query with a single
  # parameter (no "&"); a lookahead for "&" would return NA there and
  # paste0() would turn it into the literal suffix "?NA"
  mutate(url_quest = ifelse(
    grepl("questionpro.com$", url_host) & grepl("/a/TakeSurvey\\?tt=[a-zA-Z0-9]", page_url),
    paste0(url_protocol, "//", url_host, url_path, "?", str_extract(url_query, "[^&]+")),
    paste0(url_protocol, "//", url_host, url_path))) %>%
  # keep participant id, timestamp, duration, same-URL flag and both URL forms
  select(caseid, created_utc, duration_s_5_na, same_as_prev_url, url_quest, page_url)

# exported as repeated_participation_YG.csv (not contained in the reproduction repository, as it is raw data)
# write.csv(repeated_YG, "repeated_participation_YG.csv", row.names = FALSE)
  